Source Code of org.terrier.structures.indexing.singlepass.hadoop.MultiFileCollectionInputFormat

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is MultiFileCollectionInputFormat.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Richard McCreadie <richardm{a.}dcs.gla.ac.uk> (original author)
*   Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
*/
package org.terrier.structures.indexing.singlepass.hadoop;

import gnu.trove.TObjectLongHashMap;
import gnu.trove.TObjectLongProcedure;

import java.io.IOException;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MultiFileInputFormat;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.CombineFileSplit;
import org.apache.log4j.Logger;
import org.terrier.indexing.Document;

/**
* Input Format Class for Hadoop Indexing. Splits the input collection into
* sets of files, where each Map task gets about the same number of files.
* Files are assumed to be unsplittable and are not split. Each split consists
* of adjacent files - i.e. split 0 always contains the first file, and the
* last split always contains the last file.
* @author Richard McCreadie and Craig Macdonald
* @since 2.2
*/
@SuppressWarnings("deprecation")
public class MultiFileCollectionInputFormat extends MultiFileInputFormat<Text, SplitAwareWrapper<Document>>
{

  /** logger for this class */
  protected static final Logger logger = Logger.getLogger(MultiFileCollectionInputFormat.class);
 
  /**
   * Instantiates a FileCollectionRecordReader using the specified split (which is
   * assumed to be a PositionAwareSplit wrapping a CombineFileSplit).
   * @param genericSplit contains the files to be processed
   * @param job JobConf of this job
   * @param reporter to report progress
   */
  @SuppressWarnings("unchecked")
  @Override
  public RecordReader<Text, SplitAwareWrapper<Document>> getRecordReader(
      InputSplit genericSplit,
      JobConf job,
      Reporter reporter)
    throws IOException
  {
    reporter.setStatus(genericSplit.toString());
    return new FileCollectionRecordReader(job, (PositionAwareSplit<CombineFileSplit>) genericSplit);
  }
 
  /**
   * Splits the input collection into sets of files, where each Map task
   * gets about the same number of files.
   */
  @SuppressWarnings("unchecked")
  @Override
  public InputSplit[] getSplits(JobConf job, int numSplits)
    throws IOException
  {

    Path[] paths = FileInputFormat.getInputPaths(job);
    // HADOOP-1818: Manage splits only if there are paths
    if (paths.length == 0)
    {
      return new InputSplit[0];
    }

    //clamp the requested number of splits to [1, number of files]
    if (numSplits > paths.length)
    {
      numSplits = paths.length;
    }
    else if (numSplits < 1)
    {
      numSplits = 1;
    }
    //logger.info("Allocating "+paths.length+ " files across "+numSplits +" map tasks");
    List<PositionAwareSplit<CombineFileSplit>> splits = new ArrayList<PositionAwareSplit<CombineFileSplit>>(numSplits);
    final int numPaths = paths.length;
    long[] lengths = new long[numPaths];
    TObjectLongHashMap<String>[] locations = (TObjectLongHashMap<String>[])Array.newInstance(TObjectLongHashMap.class, numPaths);
    long totLength = 0;
    final FileSystem fs = FileSystem.get(job);
    for(int i=0; i<paths.length; i++)
    {
      final FileStatus fss = fs.getFileStatus(paths[i]);
      lengths[i] = fss.getLen();
      totLength += lengths[i];
      final TObjectLongHashMap<String> location2size = locations[i] = new TObjectLongHashMap<String>();
      final long normalblocksize = fss.getBlockSize();
      for(long offset = 0; offset < lengths[i]; offset += normalblocksize)
      {
        //length of this block: a full block, or the remainder of the file
        final long blocksize = Math.min(normalblocksize, lengths[i] - offset);
        final BlockLocation[] blockLocations = fs.getFileBlockLocations(fss, offset, blocksize);
        for(BlockLocation bl : blockLocations)
        {
          for (String host : bl.getHosts())
          {
            //credit this host with this block's bytes (adding to any existing count)
            location2size.adjustOrPutValue(host, blocksize, blocksize);
          }
        }
      }
    }
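    //at this point, lengths[i] is the size of file i, and locations[i] maps
    //each host name to the number of bytes of file i stored on that host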
   
    //we need to over-estimate using ceil, to ensure that the last split is not /too/ big
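    //e.g. (illustrative figures): 10 files across 4 map tasks gives
    //ceil(10/4) = 3 files per split, producing splits of 3, 3, 3 and 1 files,
    //so the final split is never larger than the others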
    final int numberOfFilesPerSplit = (int)Math.ceil((double)paths.length / (double)numSplits);
   
    int pathsUsed = 0;
    int splitnum = 0;
    CombineFileSplit mfs;
    // build each split in turn; only the last may contain fewer than numberOfFilesPerSplit files
    while(pathsUsed < numPaths)
    {
      /* calculate the split size for this task - usually numberOfFilesPerSplit,
       * but less than this for the last split */
      final int splitSizeForThisSplit = numberOfFilesPerSplit + pathsUsed > numPaths
        ? numPaths - pathsUsed
        : numberOfFilesPerSplit;
      //arrays of information for split
      Path[] splitPaths = new Path[splitSizeForThisSplit];
      long[] splitLengths = new long[splitSizeForThisSplit];
      long[] splitStarts = new long[splitSizeForThisSplit]; //left as zeros: each file is read whole, from its beginning
      final TObjectLongHashMap<String> allLocationsForSplit = new TObjectLongHashMap<String>();
      String[] splitLocations = null; //final recommended locations for this split.
      for(int i=0;i<splitSizeForThisSplit;i++)
      {
        locations[pathsUsed+i].forEachEntry(new TObjectLongProcedure<String>() {
          public boolean execute(String a, long b)
          {
            allLocationsForSplit.adjustOrPutValue(a, b, b); return true;
          }
        });
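        //keep at most 3 preferred hosts per split (plausibly matching the
        //default HDFS replication factor of 3 - an assumption, not stated in
        //the source); when more hosts are known, favour those holding the
        //most bytes of this split's files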
        if (allLocationsForSplit.size() <= 3)
        {
          splitLocations = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
        }
        else
        {
          String[] hosts = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
          Arrays.sort(hosts, new Comparator<String>() {
            public int compare(String o1, String o2) {
              long diffamount = allLocationsForSplit.get(o1) - allLocationsForSplit.get(o2);
              if (diffamount > 0)
              {
                return -1;
              }
              else if (diffamount < 0)
              {
                return 1;
              }
              return 0;
            }
          });
          splitLocations = new String[3];
          System.arraycopy(hosts, 0, splitLocations, 0, 3);
        }
      }
     
     
      //copy information for this split
      System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit);
      System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit);
      //count the number of paths consumed
      pathsUsed += splitSizeForThisSplit;
     
      //make the actual split object
      //logger.info("New split of size " + splitSizeForThisSplit);
      mfs = new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations);
      splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum));
      splitnum++;
    }

    if (pathsUsed != paths.length)
    {
      throw new IOException("Number of used paths does not equal total available paths!");
    }
    return splits.toArray(new PositionAwareSplit[splits.size()]);   
  }

}
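
Below is a minimal usage sketch, not part of the Terrier source: it shows how this input format might be wired into a job using the deprecated org.apache.hadoop.mapred API. The driver class name, input path argument and map task count are hypothetical, and the mapper/reducer configuration is elided.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.terrier.structures.indexing.singlepass.hadoop.MultiFileCollectionInputFormat;

public class SubmitIndexingJob {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(SubmitIndexingJob.class);
    //each map task will receive a PositionAwareSplit wrapping a CombineFileSplit
    //of adjacent, unsplit collection files
    job.setInputFormat(MultiFileCollectionInputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    //a hint only: getSplits() clamps this to [1, number of input files]
    job.setNumMapTasks(16);
    //... set the Mapper, Reducer, and output key/value classes here (elided) ...
    JobClient.runJob(job);
  }
}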